## Callin Switzer
## 29 Nov 2016
## 8 Dec 2016 Update -- re-digitized pollen and anthers very carefully
## Use cross validation to choose smoothing parameter, using a smoothing spline
## 8 Feb 2017, cleaned up code for submission of paper

# use ten -fold cross-validation to decide smoothing parameters or plot noise/resoluation tradeoff

Setup

ipak <- function(pkg){
     new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
     if(length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
     sapply(pkg, require, character.only = TRUE)
}

packages <- c("ggplot2", "scales", "multcomp", "plyr", "car", "lme4", "signal")

ipak(packages)
##  ggplot2   scales multcomp     plyr      car     lme4   signal 
##     TRUE     TRUE     TRUE     TRUE     TRUE     TRUE     TRUE
# set random state
set.seed(12345)

# print session info and time the code was run last
Sys.time()
## [1] "2017-03-15 09:30:23 EDT"
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] signal_0.7-6    lme4_1.1-12     Matrix_1.2-8    car_2.1-4      
##  [5] plyr_1.8.4      multcomp_1.4-6  TH.data_1.0-8   MASS_7.3-45    
##  [9] survival_2.40-1 mvtnorm_1.0-5   scales_0.4.1    ggplot2_2.2.1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.9        nloptr_1.0.4       tools_3.3.2       
##  [4] digest_0.6.12      evaluate_0.10      tibble_1.2        
##  [7] gtable_0.2.0       nlme_3.1-130       lattice_0.20-34   
## [10] mgcv_1.8-16        yaml_2.1.14        parallel_3.3.2    
## [13] SparseM_1.74       stringr_1.1.0      knitr_1.15.1      
## [16] MatrixModels_0.4-1 rprojroot_1.2      grid_3.3.2        
## [19] nnet_7.3-12        rmarkdown_1.3      minqa_1.2.4       
## [22] magrittr_1.5       backports_1.0.5    codetools_0.2-15  
## [25] htmltools_0.3.5    splines_3.3.2      assertthat_0.1    
## [28] pbkrtest_0.4-6     colorspace_1.3-2   quantreg_5.29     
## [31] sandwich_2.3-4     stringi_1.1.2      lazyeval_0.2.0    
## [34] munsell_0.4.3      zoo_1.7-14
dataDirectory = '/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/DatasetsSupplemental/'

# read in metadata
dfile <- "LaurelsOnly.csv"
metDat <- read.csv(paste0(dataDirectory, dfile))

metDat <- metDat[metDat$redigitizedFile!= "", ]

# set constants:
fps <- 5000 # frames per second


# read in each .csv file for analysis
# make a list of data frames
digdirect <- "/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/DatasetsSupplemental/CleanKalmiaDigitized/"

10-fold CV for anthers

Note: We don’t report anther stuff in the paper, because it’s basically the same as the pollen stuff.

# not run
newDF <- data.frame()

bestSmthAnth <- numeric()

for(ii in 1:nrow(metDat)){
     
     ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
     
     dp <- read.csv(ddfile)

     # get anther and pollen locations
     antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y, 
                              polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
     # gives only rows where either anth and pollen are complete
     antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) | 
                                   complete.cases(antherPoll[c('polx')]), ]
     
     # if x value starts to right of screen, reverse points,
     # so all x values start on the left part of the screen at 0
     if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
          antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
          antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
     }
     
     
     
     # visualize groups for cv
     anther <- antherPoll[,c("anthx", "anthy") ]
     anther <- anther[complete.cases(anther), ]
     anther$sampNum <- sample(rep(sample(1:10), length = nrow(anther)))
     anther$time1 <- 1:nrow(anther)
     
     
     newL <- data.frame()
     for(spp in seq(0.001, to = 0.49, by  = 0.01)){
          sumErr <- numeric()
          for(cvg in 1:10){
               xy <- anther
               colnames(xy) <- c("x", "y", "sampeNum", "time1")
               xy$y[anther$sampNum == cvg] <- xy$x[anther$sampNum == cvg]<-  NA
               xy = xy[complete.cases(xy), ]
               #spp <- 0.001
               smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
               smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
               # plot(y = smy$y, x = smx$y, type = 'n')
               # points(xy$x, xy$y, pch = 20) # actual values
               newPredsx = predict(object = smx, x = anther$time1[anther$sampNum == cvg])
               newPredsy = predict(object = smy, x = anther$time1[anther$sampNum == cvg])
               # points(newPredsx$y, y = newPredsy$y, pch = 20, col = 'red') # predicted gaps
               # points(x = anther$anthx[anther$sampNum == cvg], 
               #        y =  anther$anthy[anther$sampNum == cvg],col = 'grey') # actual gaps
               # legend("bottomright", legend = c("raw data", "predicted gaps", "observed gaps"), 
               #        pch = c(20, 20, 1), col = c('black', 'red', 'grey'))
               
               # add up differences -- predicted - observed
               sumdiffs <- data.frame(xObs = anther$anthx[anther$sampNum == cvg],  xPred = newPredsx$y,  
                                      yObs = anther$anthy[anther$sampNum == cvg], yPred = newPredsy$y)
               
               # find distance between points
               sumdiffs$dists = apply(X = sumdiffs, MARGIN = 1, function(x) {
                    return(sqrt((x[1] - x[2])^2 + (x[3] - x[4])^2))
               })
               
               # sum of errors / total number of predicted points
               sumErr[cvg] <- sum(sumdiffs$dists) / nrow(sumdiffs)
          }
          newL <- rbind(newL, c(spp, sum(sumErr) / 10))
     }
     colnames(newL) <- c("smoothingParameter", "sumOfErrors")
     newL
     plot(newL)
     
     bestSmthAnth[ii] <- newL$smoothingParameter[which.min(newL$sumOfErrors)]
}   

hist(bestSmthAnth)
abline(v = mean(bestSmthAnth))
mean(bestSmthAnth) 
median(bestSmthAnth) #,0.291,  maybe go with median, since the dist is skewed

### END OF 10-fold CV for anthers

10-fold CV for pollen

bestSmthPol <- numeric()

for(ii in 1:nrow(metDat)){
     
     ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
     
     dp <- read.csv(ddfile)
     
     # get anther and pollen locations
     antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y, 
                              polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
     # gives only rows where either anth and pollen are complete
     antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) | 
                                   complete.cases(antherPoll[c('polx')]), ]
     
     # if x value starts to right of screen, reverse points,
     # so all x values start on the left part of the screen at 0
     if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
          antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
          antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
     }
     
     
     
     # visualize groups for cv
     pollen <- antherPoll[,c("polx", "poly") ]
     pollen <- pollen[complete.cases(pollen), ]
     pollen$sampNum <- sample(rep(sample(1:10), length = nrow(pollen)))
     pollen$time1 <- 1:nrow(pollen)
     
     
     newL <- data.frame()
     for(spp in seq(0.001, to = 0.49, by  = 0.01)){
          sumErr <- numeric()
          for(cvg in 1:10){
               xy <- pollen
               colnames(xy) <- c("x", "y", "sampeNum", "time1")
               xy$y[pollen$sampNum == cvg] <- xy$x[pollen$sampNum == cvg]<-  NA
               xy = xy[complete.cases(xy), ]
               #spp <- 0.001
               smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
               smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
               # plot(y = smy$y, x = smx$y, type = 'n')
               # points(xy$x, xy$y, pch = 20) # actual values
               newPredsx = predict(object = smx, x = pollen$time1[pollen$sampNum == cvg])
               newPredsy = predict(object = smy, x = pollen$time1[pollen$sampNum == cvg])
               # points(newPredsx$y, y = newPredsy$y, pch = 20, col = 'red') # predicted gaps
               # points(x = pollen$polx[pollen$sampNum == cvg],
               #        y =  pollen$poly[pollen$sampNum == cvg],col = 'grey') # actual gaps
               # legend("bottomright", legend = c("raw data", "predicted gaps", "observed gaps"), 
               #        pch = c(20, 20, 1), col = c('black', 'red', 'grey'))
               
               # add up differences -- predicted - observed
               sumdiffs <- data.frame(xObs = pollen$polx[pollen$sampNum == cvg],  xPred = newPredsx$y,  
                                      yObs = pollen$poly[pollen$sampNum == cvg], yPred = newPredsy$y)
               
               # find distance between points
               sumdiffs$dists = apply(X = sumdiffs, MARGIN = 1, function(x) {
                    return(sqrt((x[1] - x[2])^2 + (x[3] - x[4])^2))
               })
               
               # sum of errors / total number of predicted points
               sumErr[cvg] <- sum(sumdiffs$dists) / nrow(sumdiffs)
          }
          newL <- rbind(newL, c(spp, sum(sumErr) / 10))
     }
     colnames(newL) <- c("smoothingParameter", "sumOfErrors")
     newL
     plot(newL)
     
     bestSmthPol[ii] <- newL$smoothingParameter[which.min(newL$sumOfErrors)]
}   

hist(bestSmthPol)
abline(v = mean(bestSmthPol)) # 

median(bestSmthPol) # ~0.29, go with median, since the dist is skewed
## [1] 0.291
mean(bestSmthPol)#  ~0.26
## [1] 0.259125
# Decision: smoothing parameter is 0.29

make some figures for supplemental files

theme_set(theme_classic())

ggplot(newL, aes(x = smoothingParameter, y = sumOfErrors)) + 
     geom_point() + 
     labs(x = "Smoothing Parameter", y = "Average out-of-sample error")

saveDir <- "/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/Media/"
ggsave(paste0(saveDir, "outOfSampleErrors_Example.pdf"), width = 4, height = 3)


plot(newL)

# plot undersmoothed vs. oversmoothed
xy <- pollen
colnames(xy) <- c("x", "y", "sampeNum", "time1")
xy = xy[complete.cases(xy), ]
xy$sampeNum <- NULL
xy$trt <- "spar = 0"

spp <- 0.29
smy_low <- data.frame(predict(smooth.spline(y = xy$y, x = xy$time1, spar = spp)))
colnames(smy_low) <- c('time1', 'y')
smx_low <- data.frame(predict(smooth.spline(y = xy$x, x = xy$time1, spar = spp)))
colnames(smx_low) <- c('time1', 'x')

lowDF <- merge(smx_low, smy_low)
lowDF$trt <- "spar = 0.29"


spp <- 0.5
smy_high <- data.frame(predict(smooth.spline(y = xy$y, x = xy$time1, spar = spp)))
colnames(smy_high) <- c('time1', 'y')
smx_high <- data.frame(predict(smooth.spline(y = xy$x, x = xy$time1, spar = spp)))
colnames(smx_high) <- c('time1', 'x')

highDF <- merge(smx_high, smy_high)
highDF$trt <- "spar = 0.5"

longDF <- rbind(xy, lowDF, highDF)

library(viridis)

ggplot(longDF, aes(x = x, y = y, color = trt)) + 
     geom_path(alpha = 0.5) + 
     geom_point(aes(shape = trt)) + 
     xlim(c(270, 280)) +
     ylim(c(15, 30)) + 
     labs(x = "X position (pixels)", y = "Y position (pixels)") + 
     coord_fixed() + 
     theme(legend.justification = c(1, 0), legend.position = c(1, .5)) + 
     scale_color_viridis(name = "Smoothing Parameter", discrete = TRUE,end = 0.9) + 
     scale_shape_discrete(name = "Smoothing Parameter")
## Warning: Removed 66 rows containing missing values (geom_path).
## Warning: Removed 66 rows containing missing values (geom_point).

ggsave(paste0(saveDir, "SuppFig2_smoothingParameter_Example.pdf"), width = 4, height = 4)
## Warning: Removed 66 rows containing missing values (geom_path).

## Warning: Removed 66 rows containing missing values (geom_point).

Visualize raw and smoothed data to see how different it is.

spp = 0.29
for(ii in 1:nrow(metDat)){
     
     ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
     
     dp <- read.csv(ddfile)
     
     # get anther and pollen locations
     antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y, 
                              polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
     # gives only rows where either anth and pollen are complete
     antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) | 
                                   complete.cases(antherPoll[c('polx')]), ]
     
     # if x value starts to right of screen, reverse points,
     # so all x values start on the left part of the screen at 0
     if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
          antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
          antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
     }
     
     
     
     # visualize groups for cv
     pollen <- antherPoll[,c("polx", "poly") ]
     pollen <- pollen[complete.cases(pollen), ]
     pollen$sampNum <- sample(rep(sample(1:10), length = nrow(pollen)))
     pollen$time1 <- 1:nrow(pollen)
     
     
     
     xy <- pollen
     colnames(xy) <- c("x", "y", "sampeNum", "time1")
     xy = xy[complete.cases(xy), ]
     smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
     smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
     plot(y = smy$y, x = smx$y, pch = 20, col = rgb(0,0,0,0.5), asp = 1)
     points(xy$x, xy$y, pch = 20, col = rgb(1,0,0, 0.5)) # actual values
     legend("bottomright", pch = c(20,20), col = c('black', 'red'), legend = c("predicted", "observed"))
     
     # Sys.sleep(0.5)
} 

visualize with anther

Not run

spp = 0.29
for(ii in 1:nrow(metDat)){
     
     ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
     
     dp <- read.csv(ddfile)
     
     # get anther and pollen locations
     antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y, 
                              polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
     # gives only rows where either anth and pollen are complete
     antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) | 
                                   complete.cases(antherPoll[c('polx')]), ]
     
     # if x value starts to right of screen, reverse points,
     # so all x values start on the left part of the screen at 0
     if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
          antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
          antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
     }
     
     
     
     # visualize groups for cv
     anther <- antherPoll[,c("anthx", "anthy") ]
     anther <- anther[complete.cases(anther), ]
     anther$time1 <- 1:nrow(anther)
     
     
     
     xy <- anther
     colnames(xy) <- c("x", "y", "time1")
     xy = xy[complete.cases(xy), ]
     smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
     smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
     plot(y = smy$y, x = smx$y, pch = 20, col = rgb(0,0,0,0.5), asp = 1)
     points(xy$x, xy$y, pch = 20, col = rgb(1,0,0, 0.5)) # actual values
     legend("bottomright", pch = c(20,20), col = c('black', 'red'), legend = c("predicted", "observed"))
     
     # Sys.sleep(0.5)
}